import numpy as np
import pandas as pd
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
import plotly.graph_objects as go
def load_data(path=r'E:\Lectures\526DATA INT VIS ANALYT\P\daily_cases.csv'):
    """Load the daily-cases CSV and return the counts as a Series.

    Parameters
    ----------
    path : str or file-like, optional
        CSV with at least 'Date' and 'Count' columns. Defaults to the
        original hard-coded location so existing calls keep working.

    Returns
    -------
    pandas.Series
        'Count' values indexed by 'Date', sorted in ascending index order.
    """
    df = pd.read_csv(path)
    # Index by date and sort so rolling windows / time-series models
    # always see the observations in chronological order.
    return df.set_index('Date').sort_index()['Count']
def test_stationarity(timeseries):
    """Plot 28-day rolling statistics and run the augmented Dickey-Fuller test.

    Shows an interactive plotly figure with the original series, its
    28-day rolling mean and its 28-day rolling standard deviation, prints
    the ADF statistic, p-value and critical values, and returns the raw
    ``adfuller`` result tuple.
    """
    window = 28
    roll_mean = timeseries.rolling(window=window).mean()
    roll_std = timeseries.rolling(window=window).std()

    # One line trace per series, all on the same figure.
    fig = go.Figure()
    for label, series in (("Original", timeseries),
                          ("Rolling Mean", roll_mean),
                          ("Rolling Std", roll_std)):
        fig.add_trace(go.Scatter(name=label, mode="lines",
                                 x=series.index, y=series))
    fig.show()

    # Augmented Dickey-Fuller test for stationarity.
    result = adfuller(timeseries)
    print('ADF Statistic: {}'.format(result[0]))
    print('p-value: {}'.format(result[1]))
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t{}: {}'.format(key, value))
    return result
ARIMA is an acronym that stands for Auto-Regressive Integrated Moving Average. It is a class of model that captures a suite of different standard temporal structures in time series data. To implement an ARIMA model, we should first make sure the original data is stationary. A stationary time series is one whose statistical properties do not depend on time; that is why time series with trends or with seasonality are not stationary.
# Load the daily case counts and check whether the raw series is stationary.
df = load_data()
test_stationarity(df)
ADF Statistic: -1.6818291524085807 p-value: 0.4404981208795331 Critical Values: 1%: -3.4312035349155057 5%: -2.861917198999832 10%: -2.566970778512368
(-1.6818291524085807,
0.4404981208795331,
36,
7664,
{'1%': -3.4312035349155057,
'5%': -2.861917198999832,
'10%': -2.566970778512368},
88672.92164897689)
Here we perform the augmented Dickey-Fuller (adfuller) test. The null hypothesis is that the data is not stationary, and the alternative hypothesis is that it is stationary. Since the p-value is greater than 0.05, we fail to reject the null hypothesis; thus the original dataset is non-stationary.
# Stabilise the variance with a log transform, then remove the trend by
# subtracting the 28-day rolling mean, and re-test stationarity on what
# is left over.
df_log = np.log(df)
rolling_mean = df_log.rolling(window=28).mean()
df_log_minus_mean = (df_log - rolling_mean).dropna()
result = test_stationarity(df_log_minus_mean)
ADF Statistic: -13.480784009240596 p-value: 3.2444682573686062e-25 Critical Values: 1%: -3.4312065535348215 5%: -2.8619185328129757 10%: -2.5669714885183432
By taking the log and subtracting the rolling mean over the last 28 days, the p-value falls below 0.05, which means we reject the null hypothesis; thus the log-transformed, mean-removed series is stationary. We also find the dataset appears to be seasonal, so we will use SARIMA instead of ARIMA; SARIMA will help us capture the seasonal movement of the data.
# ACF/PACF of the stationary (log, mean-removed) series, used to choose
# the MA and AR orders for the (S)ARIMA model.
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
# plot_acf(ax=ax1) draws into ax1 and returns the figure that owns it,
# i.e. the same `fig` object, so adding the second subplot below works.
fig = plot_acf(df_log_minus_mean,lags=10,ax=ax1)
ax2 = fig.add_subplot(212)
fig = plot_pacf(df_log_minus_mean,lags=10,ax=ax2)
The ACF at lag 2 is significant (above 0.2), so we set the MA order to 2; the PACF at lag 1 is significant, so we set the AR order to 1.
# Fit a seasonal ARIMA: non-seasonal (p, d, q) = (1, 1, 2) chosen from the
# ACF/PACF plots above, seasonal (P, D, Q, s) = (1, 0, 1, 28) for the
# 28-day cycle observed in the data.
results = sm.tsa.statespace.SARIMAX(
    df,
    order=(1, 1, 2),
    seasonal_order=(1, 0, 1, 28),
).fit()
print(results.summary())
D:\Anaconda3\envs\arima\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used. D:\Anaconda3\envs\arima\lib\site-packages\statsmodels\tsa\base\tsa_model.py:159: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
SARIMAX Results
============================================================================================
Dep. Variable: Count No. Observations: 7701
Model: SARIMAX(1, 1, 2)x(1, 0, [1], 28) Log Likelihood -44449.615
Date: Tue, 19 Apr 2022 AIC 88911.231
Time: 21:21:08 BIC 88952.925
Sample: 01-01-2001 HQIC 88925.529
- 01-31-2022
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.3870 0.039 10.020 0.000 0.311 0.463
ma.L1 -1.0044 0.039 -25.644 0.000 -1.081 -0.928
ma.L2 0.0921 0.033 2.751 0.006 0.026 0.158
ar.S.L28 0.9974 0.001 1501.196 0.000 0.996 0.999
ma.S.L28 -0.9694 0.004 -258.768 0.000 -0.977 -0.962
sigma2 6021.3619 33.788 178.208 0.000 5955.138 6087.586
===================================================================================
Ljung-Box (Q): 206.57 Jarque-Bera (JB): 85973.24
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.42 Skew: 1.15
Prob(H) (two-sided): 0.00 Kurtosis: 19.21
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
All parameter p-values are below 0.05, which means every estimated coefficient is statistically significant.
results.plot_diagnostics()
Top left: The residual errors seem to fluctuate around a mean of zero and have a uniform variance.
Top right: the density plot suggests a normal distribution with mean zero.
Bottom left: All the dots should fall perfectly in line with the red line. Any significant deviations would imply the distribution is skewed.
Bottom Right: The Correlogram, aka, ACF plot shows the residual errors are not autocorrelated. Any autocorrelation would imply that there is some pattern in the residual errors which are not explained in the model.
Our model seems like a good fit.
# Show the most recent 396 observed daily counts next to the forecast.
df_original = df[-396:]
n_periods = 333
# Out-of-sample forecast. Start at the first position past the observed
# sample (len(df)) instead of the hard-coded 7701, so this cell still
# works when the data grows. `predict`'s `end` is inclusive, so the end
# index is start + n_periods - 1 to produce exactly n_periods values
# (the original start/start+n_periods produced n_periods + 1).
start = len(df)
prediction = results.predict(start=start, end=start + n_periods - 1,
                             dynamic=True)
fig = go.Figure()
fig.add_trace(go.Scatter(
    name="Original",
    mode="lines", x=df_original.index, y=df_original
))
fig.add_trace(go.Scatter(
    name="Prediction",
    mode="lines", x=prediction.index, y=prediction
))
fig.show()